## calculation of village, gun, and 5km Simpson stats
rm(list=ls())
library("plyr")
library("tidyverse")
library("tidymodels")
library("parallel")
library("pracma")
library("reader")
library("sf")
library("sp")
library("raster")
library("pROC")
library("caret")
library("mfx")
library("erer")
library("aod")

setwd <- getwd() ## assuming files in same folder

no_cores <- detectCores() - 1
cl <- makeCluster(no_cores)

Parcels.df <- read.delim("Parcels_sample.txt")
Parcels.df$standard_ryo_azukari <- Parcels.df$standard_ryo
## this line allows for multiple approaches to azukarichi
## because azukarichi were controlled by domains, for purposes of fragmentation calculations, they are treated as domain holdings
## for purposes of mapping, the binary "azukari" variable is used to flag loaned lands


unique_loc.df <- Parcels.df[!duplicated(Parcels.df$unique_location),] ## create data frame of unique locations
holder_finder <- function(i) { ## Simpson at village level and and largest holders at village level
  # create temporary crosstab for village level stats
  temp_table.df <- data.frame(xtabs(total_koku~standard_ryo_azukari,Parcels.df[which(Parcels.df$unique_location==unique_loc.df$unique_location[i]),]),stringsAsFactors = FALSE) ## get all holders per parcel
  Simpson <- sum((temp_table.df$Freq/sum(temp_table.df$Freq))^2)
  parcel_count <- nrow(temp_table.df)
  temp_table.df <- temp_table.df[order(-temp_table.df$Freq),] ## rank holders by size
  temp_table.df$standard_ryo_azukari  <- as.character(temp_table.df$standard_ryo_azukari)
  holder_one <- temp_table.df$standard_ryo_azukari[1] # name of holder
  holder_two <- ifelse(temp_table.df$Freq[2]>0,temp_table.df$standard_ryo_azukari[2],NA)
  holder_three <- ifelse(temp_table.df$Freq[3]>0,temp_table.df$standard_ryo_azukari[3],NA)
  holder_four <- ifelse(temp_table.df$Freq[4]>0,temp_table.df$standard_ryo_azukari[4],NA)
  holder_five <- ifelse(temp_table.df$Freq[5]>0,temp_table.df$standard_ryo_azukari[5],NA)
  holder_one_koku <- temp_table.df$Freq[1]  # size of holding
  holder_two_koku <- ifelse(temp_table.df$Freq[2]>0,temp_table.df$Freq[2],NA)
  holder_three_koku <- ifelse(temp_table.df$Freq[3]>0,temp_table.df$Freq[3],NA)
  holder_four_koku <- ifelse(temp_table.df$Freq[4]>0,temp_table.df$Freq[4],NA)
  holder_five_koku <- ifelse(temp_table.df$Freq[5]>0,temp_table.df$Freq[5],NA)
  total_kokudaka <- sum(temp_table.df$Freq,na.rm=TRUE)
  return(c(unique_loc.df$unique_location[i],unique_loc.df$kuni[i],unique_loc.df$gun[i],unique_loc.df$mura[i],unique_loc.df$postal[i],unique_loc.df$search_string[i],unique_loc.df$lon[i],unique_loc.df$lat[i],Simpson,parcel_count,holder_one,holder_two,holder_three,holder_four,holder_five,
           holder_one_koku,holder_two_koku,holder_three_koku,holder_four_koku,holder_five_koku,total_kokudaka)
)  
}

results <- mclapply(seq_along(unique_loc.df$unique_location), holder_finder)


## convert list to data frame and assign names
village.df <- data.frame(matrix(unlist(results), byrow = TRUE, nrow = length(results)),stringsAsFactors = FALSE)
colnames(village.df) <- c("unique_location","kuni","gun","mura","postal_code","search_string","lon","lat","Simpson","parcel_count","holder_one","holder_two","holder_three","holder_four","holder_five",
                          "holder_one_koku","holder_two_koku","holder_three_koku","holder_four_koku","holder_five_koku","total_kokudaka")

## categories are based on holder names and reference works 
## this table has a unique category for each holder
Parcels.df$holding_category <- ifelse(Parcels.df$azukari,"shogunal_loan",Parcels.df$holding_category)
category_table <- Parcels.df[,c("standard_ryo_azukari","holding_category")]
category_table <- category_table[!duplicated(category_table),]
unique(category_table$holding_category)
## assign categories
village.df$holder_one_cat <- mapvalues(village.df$holder_one,category_table$standard_ryo_azukari,category_table$holding_category)
village.df$holder_two_cat <- mapvalues(village.df$holder_two,category_table$standard_ryo_azukari,category_table$holding_category)
village.df$holder_three_cat <- mapvalues(village.df$holder_three,category_table$standard_ryo_azukari,category_table$holding_category)
village.df$holder_four_cat <- mapvalues(village.df$holder_four,category_table$standard_ryo_azukari,category_table$holding_category)
village.df$holder_five_cat <- mapvalues(village.df$holder_five,category_table$standard_ryo_azukari,category_table$holding_category)


village.df$holder_one_cat <- ifelse(is.na(village.df$holder_one),NA,village.df$holder_one_cat)
village.df$holder_two_cat <- ifelse(is.na(village.df$holder_two),NA,village.df$holder_two_cat)
village.df$holder_three_cat <- ifelse(is.na(village.df$holder_three),NA,village.df$holder_three_cat)
village.df$holder_four_cat <- ifelse(is.na(village.df$holder_four),NA,village.df$holder_four_cat)
village.df$holder_five_cat <- ifelse(is.na(village.df$holder_five),NA,village.df$holder_five_cat)


## create crosstabs to get percentages by each holding category
temp.df <- Parcels.df
temp.df$holding_category <- as.factor(temp.df$holding_category)
holder_type_counter <- function(i) { ## Simpson and largest holders
  temp_table.df <- data.frame(xtabs(total_koku~as.factor(holding_category),temp.df[which(temp.df$unique_location==unique_loc.df$unique_location[i]),]),stringsAsFactors = FALSE)
  temp_table.df$perc <- temp_table.df$Freq/sum(temp_table.df$Freq)*100
  return(temp_table.df$perc)
}

results <- mclapply(seq_along(unique_loc.df$unique_location), holder_type_counter)


percs.df <- data.frame(matrix(unlist(results), nrow = length(results), byrow = TRUE),stringsAsFactors = FALSE)
colnames(percs.df) <- levels(temp.df$holding_category)

village.df <- cbind(village.df,percs.df) 

## gun level calculations
i <- 1
Parcels.df$kuni_gun <- paste(Parcels.df$kuni, Parcels.df$gun, sep="_") ## creat unique marker for gun
unique_loc_kuni_gun.df <- Parcels.df[!duplicated(Parcels.df$kuni_gun),]
gun_holder_finder <- function(i) { ## Simpson and largest holders
  ## crosstabs at the county/gun level
  temp_table.df <- data.frame(xtabs(total_koku~standard_ryo_azukari,Parcels.df[which(Parcels.df$kuni_gun==unique_loc_kuni_gun.df$kuni_gun[i]),]),stringsAsFactors = FALSE)
  Simpson_gun <- sum((temp_table.df$Freq/sum(temp_table.df$Freq))^2)
  temp_table.df <- temp_table.df[order(-temp_table.df$Freq),]
  temp_table.df$standard_ryo_azukari  <- as.character(temp_table.df$standard_ryo_azukari)
  parcel_count <- nrow(temp_table.df)
  holder_one <- temp_table.df$standard_ryo_azukari[1]
  holder_two <- ifelse(temp_table.df$Freq[2]>0,temp_table.df$standard_ryo_azukari[2],NA)
  holder_three <- ifelse(temp_table.df$Freq[3]>0,temp_table.df$standard_ryo_azukari[3],NA)
  holder_four <- ifelse(temp_table.df$Freq[4]>0,temp_table.df$standard_ryo_azukari[4],NA)
  holder_five <- ifelse(temp_table.df$Freq[5]>0,temp_table.df$standard_ryo_azukari[5],NA)
  holder_one_koku <- temp_table.df$Freq[1]
  holder_two_koku <- ifelse(temp_table.df$Freq[2]>0,temp_table.df$Freq[2],NA)
  holder_three_koku <- ifelse(temp_table.df$Freq[3]>0,temp_table.df$Freq[3],NA)
  holder_four_koku <- ifelse(temp_table.df$Freq[4]>0,temp_table.df$Freq[4],NA)
  holder_five_koku <- ifelse(temp_table.df$Freq[5]>0,temp_table.df$Freq[5],NA)
  total_kokudaka <- sum(temp_table.df$Freq,na.rm=TRUE)
  return(c(unique_loc_kuni_gun.df$kuni_gun[i],Simpson_gun,parcel_count, holder_one, holder_two, holder_three, holder_four, holder_five, 
           holder_one_koku, holder_two_koku, holder_three_koku, holder_four_koku, holder_five_koku, total_kokudaka))
}

###


results <- mclapply(seq_along(unique_loc_kuni_gun.df$kuni_gun), gun_holder_finder)

## convert list to data frame and set varaible names
gun.df <- data.frame(matrix(unlist(results), byrow = TRUE, nrow = length(results)),stringsAsFactors = FALSE)
colnames(gun.df) <- c("kuni_gun","Simpson_gun","parcel_count_gun","holder_one_gun","holder_two_gun","holder_three_gun","holder_four_gun","holder_five_gun",
                          "holder_one_koku_gun","holder_two_koku_gun","holder_three_koku_gun","holder_four_koku_gun","holder_five_koku_gun","total_kokudaka_gun")


## assign categories to holders
gun.df$holder_one_cat_gun <- mapvalues(gun.df$holder_one_gun,category_table$standard_ryo_azukari,category_table$holding_category)
gun.df$holder_two_cat_gun <- mapvalues(gun.df$holder_two_gun,category_table$standard_ryo_azukari,category_table$holding_category)
gun.df$holder_three_cat_gun <- mapvalues(gun.df$holder_three_gun,category_table$standard_ryo_azukari,category_table$holding_category)
gun.df$holder_four_cat_gun <- mapvalues(gun.df$holder_four_gun,category_table$standard_ryo_azukari,category_table$holding_category)
gun.df$holder_five_cat_gun <- mapvalues(gun.df$holder_five_gun,category_table$standard_ryo_azukari,category_table$holding_category)


gun.df$holder_one_cat_gun <- ifelse(is.na(gun.df$holder_one_gun),NA,gun.df$holder_one_cat_gun)
gun.df$holder_two_cat_gun <- ifelse(is.na(gun.df$holder_two_gun),NA,gun.df$holder_two_cat_gun)
gun.df$holder_three_cat_gun <- ifelse(is.na(gun.df$holder_three_gun),NA,gun.df$holder_three_cat_gun)
gun.df$holder_four_cat_gun <- ifelse(is.na(gun.df$holder_four_gun),NA,gun.df$holder_four_cat_gun)
gun.df$holder_five_cat_gun <- ifelse(is.na(gun.df$holder_five_gun),NA,gun.df$holder_five_cat_gun)



village.df$kuni_gun <- paste(village.df$kuni,village.df$gun,sep="_")
combined.df <- left_join(village.df,gun.df,by="kuni_gun", keep=FALSE)

combined.df$lat <- as.numeric(combined.df$lat) ## corrects any stray numerics as strings
combined.df$lon <- as.numeric(combined.df$lon)


## radius calculations
## create Japan-center projection with planar coordinates
Japan.proj = CRS("+proj=lcc +lat_0=36 +lon_0=137 +lat_1=30 +lat_2=42 +x_0=0 +y_0=0 +datum=WGS84 +units=m +no_defs")
## https://proj.org/operations/projections/lcc.html
GCS <- CRS("+proj=longlat +ellps=WGS84 +datum=WGS84")

sample.df <- combined.df ## duplicate data frame 

sample.df <- sample.df[which(!is.na(sample.df$lat)),] #remove NAs
sample.df <- sample.df[which(!is.na(sample.df$lon)),]

parcels_clean.df <- Parcels.df ## duplicate data frame 
parcels_clean.df$lat <- as.numeric(parcels_clean.df$lat) # clean stray strings
parcels_clean.df$lon <- as.numeric(parcels_clean.df$lon) 

parcels_clean.df <- parcels_clean.df[which(!is.na(parcels_clean.df$lat)),] #remove NAs
parcels_clean.df <- parcels_clean.df[which(!is.na(parcels_clean.df$lon)),]

sample.sf <- sf::st_as_sf(sample.df, coords = c("lon","lat"), crs=GCS) ## creates sf object
sample.sf <- st_transform(sample.sf, crs = Japan.proj) ## converts to projection to allow radius in meters


parcels.sf <- sf::st_as_sf(parcels_clean.df, coords = c("lon","lat"), crs=GCS)
parcels.sf <- st_transform(parcels.sf, crs = Japan.proj)



buffer.sf <- st_buffer(sample.sf, 5000) ## create 5km radius around each location 
radius <- sf::st_intersects(buffer.sf, sample.sf) ## get all locations within 5km for each location


Simpson_radius <- function(i){
  temp.vec <- unlist(radius[[i]])
  places <- sample.sf$unique_location[temp.vec]
  temp.df <- parcels_clean.df[which(parcels_clean.df$unique_location%in%places),]
  temp_table.df <- data.frame(xtabs(temp.df$total_koku~temp.df$standard_ryo_azukari)) 
  ## create crosstab for 5km radius around each village
  temp_table.df$standard_ryo_azukari <- as.character(temp_table.df$temp.df.standard_ryo)
  temp_table.df$perc <- temp_table.df$Freq/sum(temp_table.df$Freq, na.rm=TRUE)
  Simpson <- as.numeric(sum(temp_table.df$perc^2))
  No_villages_in_radius <- as.numeric(length(unique(places)))
  Holders_in_radius <- as.numeric(length(temp_table.df$standard_ryo_azukari))
  Cluster_total_koku <- as.numeric(sum(temp_table.df$Freq, na.rm=TRUE))
  temp_table.df <- temp_table.df[order(-temp_table.df$Freq),]
  holder_one_Simpson_5k <- temp_table.df$standard_ryo_azukari[1]
  holder_two_Simpson_5k <- ifelse(temp_table.df$Freq[2]>0,temp_table.df$standard_ryo_azukari[2],NA)
  holder_three_Simpson_5k <- ifelse(temp_table.df$Freq[3]>0,temp_table.df$standard_ryo_azukari[3],NA)
  holder_four_Simpson_5k <- ifelse(temp_table.df$Freq[4]>0,temp_table.df$standard_ryo_azukari[4],NA)
  holder_five_Simpson_5k <- ifelse(temp_table.df$Freq[5]>0,temp_table.df$standard_ryo_azukari[5],NA)
  holder_one_koku_Simpson_5k <- as.numeric(temp_table.df$Freq[1])
  holder_two_koku_Simpson_5k <- as.numeric(ifelse(temp_table.df$Freq[2]>0,temp_table.df$Freq[2],NA))
  holder_three_koku_Simpson_5k <- as.numeric(ifelse(temp_table.df$Freq[3]>0,temp_table.df$Freq[3],NA))
  holder_four_koku_Simpson_5k <- as.numeric(ifelse(temp_table.df$Freq[4]>0,temp_table.df$Freq[4],NA))
  holder_five_koku_Simpson_5k <- as.numeric(ifelse(temp_table.df$Freq[5]>0,temp_table.df$Freq[5],NA))
  
  c(sample.sf$unique_location[i],Simpson,Cluster_total_koku,No_villages_in_radius,Holders_in_radius,
    holder_one_Simpson_5k, holder_two_Simpson_5k, holder_three_Simpson_5k, holder_four_Simpson_5k, holder_five_Simpson_5k, 
    holder_one_koku_Simpson_5k, holder_two_koku_Simpson_5k, holder_three_koku_Simpson_5k, holder_four_koku_Simpson_5k, holder_five_koku_Simpson_5k)
}


output <- mclapply(seq_along(radius), Simpson_radius)



output.df <- data.frame(matrix(unlist(output), nrow=length(output), byrow=TRUE),stringsAsFactors=FALSE)
# rename output and convert strings to numeric
output.df$unique_location <- output.df$X1
output.df$Simpson_5km_radius <- as.numeric(output.df$X2)
output.df$Total_koku_5km_radius <- as.numeric(output.df$X3)
output.df$Village_count_5km_radius <- as.numeric(output.df$X4)
output.df$Holder_count_5km_radius <- as.numeric(output.df$X5)
output.df$holder_one_Simpson_5k <- output.df$X6
output.df$holder_two_Simpson_5k <- output.df$X7
output.df$holder_three_Simpson_5k <- output.df$X8
output.df$holder_four_Simpson_5k <- output.df$X9
output.df$holder_five_Simpson_5k <- output.df$X10
output.df$holder_one_koku_Simpson_5k <- as.numeric(output.df$X11)
output.df$holder_two_koku_Simpson_5k <- as.numeric(output.df$X12)
output.df$holder_three_koku_Simpson_5k <- as.numeric(output.df$X13)
output.df$holder_four_koku_Simpson_5k <- as.numeric(output.df$X14)
output.df$holder_five_koku_Simpson_5k <- as.numeric(output.df$X15)


keep_output.df <- output.df[,16:30]


## this is now primary data source
all.df <- cbind.data.frame(sample.df,keep_output.df[,-1])

## combine the 5km Simpson calculations for each domain -- calculate mean
## these data are use for logit analysis of monopolies
## this analysis treats azukarichi as part of domains for Simpson 5km because they were under domain control
## BUT it excludes azukarichi from domain kokudaka because revenue went to shogunate
domain_level_data.df <- read.table("monopoly_data.csv", header = TRUE, quote="", sep=",")
domain_level_data.df$kokudaka <- NA ## empty vectors for loop
domain_level_data.df$Simpson_5km <- NA 

for (i in seq_along(domain_level_data.df$standard_domain_name)){
domain_level_data.df$Simpson_5km[i] <- mean(all.df$Simpson_5km_radius[which(all.df$holder_one==domain_level_data.df$standard_domain_name[i])], na.rm=TRUE)
domain_level_data.df$kokudaka[i] <- sum(Parcels.df$total_koku[which(Parcels.df$standard_domain_names==domain_level_data.df$standard_domain_name[i]
                                                                    &Parcels.df$azukari==FALSE)], na.rm=TRUE)
}
## clean data for numeric binary logit
domain_for_logit.df <- domain_level_data.df
domain_for_logit.df$daimyo_category <- ifelse(is.na(domain_for_logit.df$daimyo_category),"unknown",domain_for_logit.df$daimyo_category)
domain_for_logit.df$fudai <- ifelse(domain_for_logit.df$daimyo_category=="fudai",1,0)
domain_for_logit.df$tozama <- ifelse(domain_for_logit.df$daimyo_category=="tozama",1,0)

domain_for_logit.df <- domain_for_logit.df[which(domain_for_logit.df$kokudaka>=5000),]

## how many domains had monopolies
monopoly_percentage <- (1-length(domain_for_logit.df$standard_domain_name[which(domain_for_logit.df$monopoly==1)])/length(domain_for_logit.df$standard_domain_name))*100


model_one <- glm(domain_for_logit.df$monopoly~domain_for_logit.df$Simpson_5km + 
                   log10(domain_for_logit.df$kokudaka),
                 x = TRUE,
                 family = "binomial")

predicted_one <- predict(model_one,domain_for_logit.df, type="response")

## use standard 50% threshold for prediction
domain_for_logit.df$prediction_one <- ifelse(predicted_one>=0.5,1,0)
raw_accuracy_one <- length(which(domain_for_logit.df$monopoly==domain_for_logit.df$prediction_one))/length(domain_for_logit.df$monopoly)
con_matrix_one <- confusionMatrix(data = as.factor(domain_for_logit.df$prediction_one), reference = as.factor(domain_for_logit.df$monopoly))

# use caret and compute a confusion matrix
# and get logit parameters to compute logit line for graphing the 50% threshold
a <- 10^((log10(0.5) - model_one$coefficients[1] - model_one$coefficients[2]*0.01)/model_one$coefficients[3])
b <- 10^((log10(0.5) - model_one$coefficients[1] - model_one$coefficients[2])/model_one$coefficients[3])
logit_line.df <- data.frame("x"=c(0.01,1),y=c(a,b))

## subsamples as discussed in text
under_50000 <- mean(domain_for_logit.df$monopoly[which(domain_for_logit.df$kokudaka<50000)])*100
over_200000 <- mean(domain_for_logit.df$monopoly[which(domain_for_logit.df$kokudaka>200000)])*100

under_0.25 <- mean(domain_for_logit.df$monopoly[which(domain_for_logit.df$Simpson_5km<0.25)])*100
over_0.75 <- mean(domain_for_logit.df$monopoly[which(domain_for_logit.df$Simpson_5km>0.75)])*100



model_two  <- glm(monopoly~Simpson_5km + 
                    log10(kokudaka) + tozama,
                  family = "binomial", x = TRUE,
                  data=domain_for_logit.df)
predicted_two <- predict(model_two,domain_for_logit.df, type="response")

domain_for_logit.df$prediction_two <- ifelse(predicted_two>=0.5,1,0)
raw_accuracy_two <- length(which(domain_for_logit.df$monopoly==domain_for_logit.df$prediction_two))/length(domain_for_logit.df$monopoly)


con_matrix_two <- confusionMatrix(data = as.factor(domain_for_logit.df$prediction_two), reference = as.factor(domain_for_logit.df$monopoly))

logit2prob <- function(logit){
  odds <- exp(logit)
  prob <- odds / (1 + odds) * 100
  return(prob)
}

coeff_one.df <- data.frame(summary(model_one)$coefficients)
coeff_two.df <- data.frame(summary(model_two)$coefficients)

oddrat_one <- exp(coef(model_one))
oddrat_two <- exp(coef(model_two))

# mean marginal effects
mean_marginal_one <- (logitmfx(formula=monopoly~Simpson_5km + log10(kokudaka), data=domain_for_logit.df, atmean = FALSE))
mean_marginal_one.df <- tidy(mean_marginal_one)
# marginal effects at mean
marginal_at_mean_one <- (logitmfx(formula=monopoly~Simpson_5km + log10(kokudaka), data=domain_for_logit.df))
marginal_at_mean_one.df <- tidy(marginal_at_mean_one)

# mean marginal effects
mean_marginal_two <- (logitmfx(formula=monopoly~Simpson_5km + log10(kokudaka) + tozama, data=domain_for_logit.df, atmean = FALSE))
mean_marginal_two.df <- tidy(mean_marginal_two)
# marginal effects at mean
marginal_at_mean_two <- (logitmfx(formula=monopoly~Simpson_5km + log10(kokudaka) + tozama, data=domain_for_logit.df))
marginal_at_mean_two.df <- tidy(marginal_at_mean_two)


# different packages ... same results!
mfx_one_at_mean <- maBina(model_one, x.mean = TRUE) # marginal effects at mean
mfx_two_at_mean <- maBina(model_two, x.mean = TRUE)

mfx_one_mean <- maBina(model_one, x.mean = FALSE) # mean marginal effects
mfx_two_mean <- maBina(model_two, x.mean = FALSE)


wald_one <-wald.test(b=coef(model_one),Sigma = vcov(model_one), Terms=1:3)
wald_two <-wald.test(b=coef(model_two),Sigma = vcov(model_two), Terms=1:3)

stopCluster(cl)

#### below is file saving

file_date <- format(Sys.time(), "%Y_%m_%d")
csv_file_name <- paste0("Village_level_combined_",file_date,".csv")
txt_file_name <-paste0("Village_level_combined_",file_date,".txt")


write.table(all.df,file=csv_file_name, sep=",", col.names = TRUE, row.names = FALSE, quote = FALSE)
write.table(all.df,file=txt_file_name, sep="\t", col.names = TRUE, row.names = FALSE, quote = FALSE)

csv_file_name <- paste0("Domain_Simpson_",file_date,".csv")
txt_file_name <-paste0("Domain_Simpson_",file_date,".txt")


write.table(domain_for_logit.df,file=csv_file_name, sep=",", col.names = TRUE, row.names = FALSE, quote = FALSE)
write.table(domain_for_logit.df,file=txt_file_name, sep="\t", col.names = TRUE, row.names = FALSE, quote = FALSE)


